library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)


options(
  tigris_class = "sf",
  tigris_use_cache = TRUE
)

Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")

Load social distancing data and blockgroups

Load the Safegraph social distancing data and San Jose blockgroups

# get SJ blockgroups 
# get San Jose block groups
scc_blockgroups <- block_groups("CA","Santa Clara", cb=F, progress_bar=F)

# Use tracts sent to us by San Jose
sj_tracts <- st_read("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Data Library/San_Jose/CSJ_Census_Tracts/CSJ_Census_Tracts.shp") %>%
  st_as_sf() %>%
  st_transform(st_crs(scc_blockgroups))
## Reading layer `CSJ_Census_Tracts' from data source `/Users/simonespeizer/pCloud Drive/Shared/SFBI/Data Library/San_Jose/CSJ_Census_Tracts/CSJ_Census_Tracts.shp' using driver `ESRI Shapefile'
## Simple feature collection with 219 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 6112856 ymin: 1869687 xmax: 6255982 ymax: 1996555
## epsg (SRID):    2227
## proj4string:    +proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000.0001016 +y_0=500000.0001016001 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=us-ft +no_defs
sj_citycouncil_disticts <- st_read("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Data Library/San_Jose/City Council Districts/CITY_COUNCIL_DISTRICTS.shp") %>%
  st_as_sf() %>%
  st_transform(st_crs(scc_blockgroups))
## Reading layer `CITY_COUNCIL_DISTRICTS' from data source `/Users/simonespeizer/pCloud Drive/Shared/SFBI/Data Library/San_Jose/City Council Districts/CITY_COUNCIL_DISTRICTS.shp' using driver `ESRI Shapefile'
## Simple feature collection with 10 features and 7 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 6112856 ymin: 1869687 xmax: 6255982 ymax: 1996555
## epsg (SRID):    2227
## proj4string:    +proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000.0001016 +y_0=500000.0001016001 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=us-ft +no_defs
# from code written by others to get SJ blockgroups
sj_blockgroups <-
  scc_blockgroups %>%
  st_centroid() %>%
  st_join(sj_tracts, left = F) %>%
  st_join(sj_citycouncil_disticts%>% dplyr::select(DISTRICTS)) %>%
  mutate(
    DISTRICTS = DISTRICTS %>% factor(levels = c("1","2","3","4","5","6","7","8","9","10"))
  ) %>%
  st_set_geometry(NULL) %>%
  left_join(scc_blockgroups%>% dplyr::select(GEOID), by = "GEOID") %>%
  st_as_sf() %>%
  dplyr::select(GEOID, DISTRICTS)

# the spatial join leaves off two blockgroups which are touching district 9. The following code assigns those to district 9
sj_blockgroups$DISTRICTS[is.na(sj_blockgroups$DISTRICTS)] <- 9

# code from others in the class to get social distancing data 
sj_socialdistancing <- readRDS("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/sj_socialdistancing.rds") %>% 
  mutate(date = date_range_start %>%  substr(1,10) %>% as.Date()) %>% 
  left_join(sj_blockgroups, by = c("origin_census_block_group" = "GEOID")) %>% 
  filter(!is.na(DISTRICTS))

# obtaining weekends
weekends <-
  sj_socialdistancing %>% 
  filter(!duplicated(date)) %>% 
  arrange(date) %>% 
  mutate(
    weekend = 
      ifelse(
        (date %>% as.numeric()) %% 7 %in% c(2,3),
        T,
        F
      )
  ) %>% 
  dplyr::select(date,weekend)

sj_socialdistancing <- 
  sj_socialdistancing %>% 
  left_join(weekends)

# date of the shelter in place order
shelter_start <- "2020-03-16" %>% as.Date()

# get average staying at home on weekdays in January and February
sj_pre_sd_at_home_average <- sj_socialdistancing %>% 
  filter(weekend == F) %>% 
  filter(date <  as.Date("2020-03-01")) %>%
  group_by(origin_census_block_group) %>% 
  summarize(completely_home_device_count = sum(completely_home_device_count), device_count = sum(device_count)) %>% 
  mutate(`% Completely at Home Pre Shelter` = (completely_home_device_count/device_count*100) %>% round(1), `% not completely at home pre shelter` = (100 - `% Completely at Home Pre Shelter`))

Obtaining demographic variables

Here I obtain various demographic data, including income (percent below 50% and 80% of area median income), vehicle ownership, age, English language ability, and occupants per room.

# obtain the saved census data 
setwd("~/Documents/2020 Spring Quarter/CEE 218Z")
acs_vars = readRDS("censusData2018_acs_acs5.rds")
setwd("~/Documents/2020 Spring Quarter/CEE 218Z/covid19")
# load in income data - code adapted from other students
sj_median_income_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "B19013_001E"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  rename(
    Median_Income = B19013_001E 
  ) %>% 
  filter(!is.na(Median_Income)) %>% 
  left_join(sj_blockgroups, by = c("blockgroup" = "GEOID")) %>% #this code gives each blockgroup a district designation
  filter(
    !is.na(DISTRICTS)
  ) %>% 
  
  # this code joins our census data with the social distancing data, processed as shown below
  left_join(sj_socialdistancing %>%  
                          filter(weekend == F) %>% 
                          filter(date > shelter_start) %>%
                          group_by(origin_census_block_group) %>% 
                          summarize(
                                    completely_home_device_count = sum(completely_home_device_count),
                                    device_count = sum(device_count)) %>% 
                          mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1), 
                                 `% not completely at home` = (100 - `% Completely at Home`)),
            by = c("blockgroup" = "origin_census_block_group")
  ) %>% 
  filter(
    !is.na(device_count)
  ) %>% 
  left_join(sj_pre_sd_at_home_average %>% dplyr::select(origin_census_block_group, `% Completely at Home Pre Shelter`, `% not completely at home pre shelter`), by = c("blockgroup" = "origin_census_block_group"))

sj_ami_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B19001)"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
  group_by(blockgroup) %>% 
  summarize(
    Total = B19001_001E,
    `Under 75,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E),
    #sum(lapply(2:12, function(x) as.name(paste0("B19001_00",x,"E"))))
    `Under 100,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E), 
    `Under 125,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E, B19001_014E)
  ) %>% 
  mutate(
    `% under 75,000` = `Under 75,000` / Total * 100,
    `% over 75,000` = (100 - `% under 75,000`),
    `% under 100,000` = `Under 100,000` / Total * 100,
    `% over 100,000` = (100 - `% under 100,000`),
    `% under 125,000` = `Under 125,000` / Total * 100,
    `% over 125,000` = (100 - `% under 125,000`),
  ) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
  ) %>% 
  filter(!is.na(device_count))
# loading in language data - code adapted from other students
sj_lang_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B16004)"
  )  %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(
    key = "variable",
    value = "estimate", 
    - blockgroup
  ) %>% 
  left_join(acs_vars, by = c("variable" = "name")) %>% 
  mutate(
    tier = substr(label,lapply(label, function(x) max(unlist(gregexpr('!!',x)))+2),nchar(label))
  ) %>% 
  filter(tier %in% c('Speak English "not well"', 
                     'Speak English "not at all"', 
                     'Total', 'Speak Spanish', 
                     'Speak Asian and Pacific Island languages')) %>% 
  group_by(blockgroup, tier) %>% 
  summarise(
    estimate1 = sum(estimate)
  ) %>% 
  spread(
    key = "tier",
    value = "estimate1"
  ) %>% 
  mutate(
    `% speaking english < well` = (`Speak English "not well"` + `Speak English "not at all"`) / Total * 100,
    `% speaking english > well` = (100 - `% speaking english < well`),
    `% speaking spanish` = (`Speak Spanish`/ Total) * 100,
    `% not speaking spanish` = (100 - `% speaking spanish`),
    `% speaking api` = (`Speak Asian and Pacific Island languages` / Total) * 100
  ) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>% 
  filter(!is.na(device_count)) %>% 
  mutate(log_perc = log(`% speaking english < well`))
# loading in age data - specifically looking at percentage 65+ and percentage <30
sj_age_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B01001)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(
    key = "variable",
    value = "estimate", 
    - blockgroup
  ) %>% 
  mutate(
    label = acs_vars$label[match(variable,acs_vars$name)]
  ) %>% 
  select(-variable) %>% 
  separate(
    label,
    into = c(NA,NA,"sex","age"),
    sep = "!!"
  ) %>% filter(!is.na(age)) %>% 
  mutate(elderly = ifelse(age %in% c("65 and 66 years", "67 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over"), estimate, NA), `less than 30` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years", "18 and 19 years", "20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA)) %>% 
  group_by(blockgroup) %>% 
  summarize(elderly = sum(elderly, na.rm = T), `less than 30` = sum(`less than 30`, na.rm = T), total = sum(estimate, na.rm = T)) %>% 
  mutate(`percent elderly` = elderly*100 / total, `percent less than 30` = `less than 30`*100 / total, `percent nonelderly` = (100 - `percent elderly`)) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>% 
  filter(!is.na(device_count)) 
# get data on vehicles available as vehicles allocation
sj_vehicles_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B992512)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  dplyr::select(B992512_001E, blockgroup) %>%
  rename(total_vehicles = B992512_001E, blockgroup = blockgroup) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  mutate(`vehicles per capita` = total_vehicles / total) %>%
  filter(!is.na(device_count)) 

# also get data on vehicles available as households without a vehicle
sj_no_vehicles_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B25044)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>%
  separate(label, into = c(NA, NA, NA,"vehicles"), sep = "!!") %>% 
  filter(!is.na(vehicles)) %>%
  group_by(blockgroup, vehicles) %>%
  summarize(grouped_vehicles = sum(estimate)) %>%
  spread(key = vehicles, value = grouped_vehicles) %>%
  mutate(total_nums = `1 vehicle available` + `2 vehicles available` + `3 vehicles available` + `4 vehicles available` + `5 or more vehicles available` + `No vehicle available`, `percent no vehicles` = `No vehicle available`*100 / total_nums, `percent with vehicles` = (100-`percent no vehicles`)) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count))
# get data on occupants per room
sj_occupants_per_room_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B25014)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>% 
  separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>% 
  filter(!is.na(`occupants per room`)) %>%
  group_by(blockgroup, `occupants per room`) %>%
  summarize(estimate_tot = sum(estimate)) %>% 
  spread(key = `occupants per room`, value = estimate_tot) %>%
  mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`, `percent 1 or more` = (`1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) * 100/ total_nums, `percent less than 1` = (100-`percent 1 or more`)) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count)) 

Testing correlations

In the plots below, I show the selected variables against percent of devices completely at home since the shelter-in-place order started, as well as against percent of devices pre-shelter-in-place for comparison.

Age:

# age
sj_age_by_block %>%
  ggplot(aes(
  x = `percent less than 30`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
labs(
    x = "Percent of residents younger than 30",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Young Age Groups"
  )

young_model <- lm(sj_age_by_block$`% not completely at home` ~ sj_age_by_block$`percent less than 30`)
summary(young_model)
## 
## Call:
## lm(formula = sj_age_by_block$`% not completely at home` ~ sj_age_by_block$`percent less than 30`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.505  -4.978  -0.362   4.274  39.808 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            44.49223    1.51348  29.397  < 2e-16 ***
## sj_age_by_block$`percent less than 30`  0.17973    0.03836   4.685 3.51e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.299 on 567 degrees of freedom
## Multiple R-squared:  0.03726,    Adjusted R-squared:  0.03557 
## F-statistic: 21.95 on 1 and 567 DF,  p-value: 3.513e-06
sj_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
  ggplot(aes(
  x = `percent elderly`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of residents 65 and older",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Elderly Population"
  )

elderly_model <- lm(`% not completely at home` ~ `percent elderly`, sj_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model)
## 
## Call:
## lm(formula = `% not completely at home` ~ `percent elderly`, 
##     data = sj_age_by_block %>% filter(`percent elderly` < 50))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.542  -5.120  -0.472   4.248  36.658 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       53.65024    0.78297   68.52   <2e-16 ***
## `percent elderly` -0.17733    0.05406   -3.28   0.0011 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.361 on 564 degrees of freedom
## Multiple R-squared:  0.01872,    Adjusted R-squared:  0.01698 
## F-statistic: 10.76 on 1 and 564 DF,  p-value: 0.001102
# compare this to pre-shelter-in-place behavior
sj_age_by_block %>%
  ggplot(aes(
  x = `percent less than 30`,
  y = `% not completely at home pre shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
labs(
    x = "Percent of residents younger than 30",
    y = "Percent devices leaving home pre-shelter-in-place",
    title = "San Jose: Staying at Home and Young Age Groups Pre Shelter-in-Place"
  )

young_model2 <- lm(sj_age_by_block$`% not completely at home pre shelter` ~ sj_age_by_block$`percent less than 30`)
summary(young_model2)
## 
## Call:
## lm(formula = sj_age_by_block$`% not completely at home pre shelter` ~ 
##     sj_age_by_block$`percent less than 30`)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.1939  -2.8160  -0.1557   2.9950  16.7071 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            81.87032    0.82253   99.53  < 2e-16 ***
## sj_age_by_block$`percent less than 30` -0.11072    0.02085   -5.31 1.57e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.51 on 567 degrees of freedom
## Multiple R-squared:  0.04738,    Adjusted R-squared:  0.0457 
## F-statistic:  28.2 on 1 and 567 DF,  p-value: 1.573e-07
sj_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
  ggplot(aes(
  x = `percent elderly`,
  y = `% not completely at home pre shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of residents 65 and older",
    y = "Percent devices leaving home on weekdays pre-shelter-in-place",
    title = "San Jose: Staying at Home and Elderly Population Pre Shelter-in-Place"
  )

elderly_model2 <- lm(`% not completely at home pre shelter` ~ `percent elderly`, sj_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model2)
## 
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `percent elderly`, 
##     data = sj_age_by_block %>% filter(`percent elderly` < 50))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.236  -2.830  -0.158   3.145  14.296 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        75.9045     0.4257 178.295  < 2e-16 ***
## `percent elderly`   0.1329     0.0294   4.522 7.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.546 on 564 degrees of freedom
## Multiple R-squared:  0.03499,    Adjusted R-squared:  0.03328 
## F-statistic: 20.45 on 1 and 564 DF,  p-value: 7.466e-06

Income:

# income - less than $75000
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% over 75,000`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Households Above 50% AMI"
  )

income_75_model <- lm(`% not completely at home` ~ `% over 75,000`, sj_ami_by_block)
summary(income_75_model)
## 
## Call:
## lm(formula = `% not completely at home` ~ `% over 75,000`, data = sj_ami_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.389  -4.795  -0.606   4.185  34.925 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     64.35928    1.11518   57.71   <2e-16 ***
## `% over 75,000` -0.20909    0.01719  -12.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.444 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2072, Adjusted R-squared:  0.2058 
## F-statistic:   148 on 1 and 566 DF,  p-value: < 2.2e-16
# income - less than $100000
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% over 100,000`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes over $100,000 (80% AMI) annually",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Households Below 80% AMI"
  )

income_100_model <- lm(`% not completely at home` ~ `% over 100,000`, sj_ami_by_block)
summary(income_100_model)
## 
## Call:
## lm(formula = `% not completely at home` ~ `% over 100,000`, data = sj_ami_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.762  -4.639  -0.500   4.176  33.309 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      61.78423    0.87244   70.82   <2e-16 ***
## `% over 100,000` -0.20475    0.01599  -12.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.362 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2246, Adjusted R-squared:  0.2232 
## F-statistic: 163.9 on 1 and 566 DF,  p-value: < 2.2e-16
# income - less than $125000
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% over 125,000`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes over $125,000 annually",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Households Below $125,000"
  )

income_125_model <- lm(`% not completely at home` ~ `% over 125,000`, sj_ami_by_block)
summary(income_125_model)
## 
## Call:
## lm(formula = `% not completely at home` ~ `% over 125,000`, data = sj_ami_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.072  -4.669  -0.845   4.018  32.374 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      60.00338    0.73524   81.61   <2e-16 ***
## `% over 125,000` -0.21065    0.01623  -12.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.339 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2294, Adjusted R-squared:  0.228 
## F-statistic: 168.5 on 1 and 566 DF,  p-value: < 2.2e-16
# compare to pre shelter in place
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% over 75,000`,
  y = `% not completely at home pre shelter`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
    y = "Percent devices leaving home on weekdays pre-shelter-in-place",
    title = "San Jose: Staying at Home and Households Above 50% AMI Pre Shelter-in-Place"
  )

income_75_model2 <- lm(`% not completely at home pre shelter` ~ `% over 75,000`, sj_ami_by_block)
summary(income_75_model2)
## 
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% over 75,000`, 
##     data = sj_ami_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.4357  -2.7003  -0.1437   2.7764  16.6680 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     72.61447    0.65712 110.504  < 2e-16 ***
## `% over 75,000`  0.08029    0.01013   7.926 1.21e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.386 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.09991,    Adjusted R-squared:  0.09832 
## F-statistic: 62.83 on 1 and 566 DF,  p-value: 1.206e-14
# income - less than $100000
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% over 100,000`,
  y = `% not completely at home pre shelter`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes over $100,000 (80% AMI) annually",
    y = "Percent devices leaving home on weekdays pre-shelter-in-place",
    title = "San Jose: Staying Home and Households Below 80% AMI Pre Shelter-in-Place"
  )

income_100_model2 <- lm(`% not completely at home pre shelter` ~ `% over 100,000`, sj_ami_by_block)
summary(income_100_model2)
## 
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% over 100,000`, 
##     data = sj_ami_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.5034  -2.6406   0.0803   2.5599  16.9387 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      73.26132    0.51177 143.152   <2e-16 ***
## `% over 100,000`  0.08532    0.00938   9.096   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.319 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1275, Adjusted R-squared:  0.126 
## F-statistic: 82.73 on 1 and 566 DF,  p-value: < 2.2e-16
# over 125000
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% over 125,000`,
  y = `% not completely at home pre shelter`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes over $125,000 annually",
    y = "Percent devices leaving home on weekdays pre-shelter-in-place",
    title = "San Jose: Social Distancing and Households Below $125,000 Pre Shelter-in-Place"
  )

income_125_model2 <- lm(`% not completely at home pre shelter` ~ `% over 125,000`, sj_ami_by_block)
summary(income_125_model2)
## 
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% over 125,000`, 
##     data = sj_ami_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.353  -2.556   0.022   2.522  16.560 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      73.640242   0.425069   173.2   <2e-16 ***
## `% over 125,000`  0.096607   0.009382    10.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.243 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1578, Adjusted R-squared:  0.1563 
## F-statistic:   106 on 1 and 566 DF,  p-value: < 2.2e-16

Language:

# language
sj_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking english > well`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking English well",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and English Language Ability"
  )

english_ability_model <- lm(`% not completely at home` ~ `% speaking english > well`, sj_lang_by_block)
summary(english_ability_model)
## 
## Call:
## lm(formula = `% not completely at home` ~ `% speaking english > well`, 
##     data = sj_lang_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.467  -5.070  -0.542   3.887  39.230 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 67.28491    3.36107  20.019  < 2e-16 ***
## `% speaking english > well` -0.17915    0.03769  -4.754 2.53e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.294 on 567 degrees of freedom
## Multiple R-squared:  0.03833,    Adjusted R-squared:  0.03663 
## F-statistic:  22.6 on 1 and 567 DF,  p-value: 2.534e-06
sj_lang_by_block %>% 
  ggplot(aes(
  x = `% not speaking spanish`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals not speaking Spanish",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Spanish Language Ability"
  )

spanish_speaking_model <- lm(`% not completely at home` ~ `% not speaking spanish`, sj_lang_by_block)
summary(spanish_speaking_model)
## 
## Call:
## lm(formula = `% not completely at home` ~ `% not speaking spanish`, 
##     data = sj_lang_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.434  -4.585  -0.796   3.568  38.341 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              63.61990    1.32137  48.147   <2e-16 ***
## `% not speaking spanish` -0.15727    0.01646  -9.555   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.849 on 567 degrees of freedom
## Multiple R-squared:  0.1387, Adjusted R-squared:  0.1372 
## F-statistic: 91.29 on 1 and 567 DF,  p-value: < 2.2e-16
# compare to pre shelter in place
sj_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking english > well`,
  y = `% not completely at home pre shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking English well",
    y = "Percent devices leaving home on weekdays pre-shelter-in-place",
    title = "San Jose: Staying at Home and English Language Ability Pre Shelter-in-Place"
  )

english_ability_model2 <- lm(`% not completely at home pre shelter` ~ `% speaking english > well`, sj_lang_by_block)
summary(english_ability_model2)
## 
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% speaking english > well`, 
##     data = sj_lang_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.9364  -2.4342   0.0388   3.0316  12.3011 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 60.84131    1.73337  35.100   <2e-16 ***
## `% speaking english > well`  0.18913    0.01943   9.732   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.277 on 567 degrees of freedom
## Multiple R-squared:  0.1431, Adjusted R-squared:  0.1416 
## F-statistic:  94.7 on 1 and 567 DF,  p-value: < 2.2e-16
sj_lang_by_block %>% 
  ggplot(aes(
  x = `% not speaking spanish`,
  y = `% not completely at home pre shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals not speaking Spanish",
    y = "Percent devices leaving home on weekdays pre shelter-in-place",
    title = "San Jose: Staying at Home and Spanish Language Ability Pre Shelter-in-Place"
  )

spanish_speaking_model2 <- lm(`% not completely at home pre shelter` ~ `% not speaking spanish`, sj_lang_by_block)
summary(spanish_speaking_model2)
## 
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% not speaking spanish`, 
##     data = sj_lang_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.793  -2.540  -0.002   2.680  11.988 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              71.228200   0.726831  97.998   <2e-16 ***
## `% not speaking spanish`  0.082206   0.009054   9.079   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.318 on 567 degrees of freedom
## Multiple R-squared:  0.1269, Adjusted R-squared:  0.1254 
## F-statistic: 82.43 on 1 and 567 DF,  p-value: < 2.2e-16

Occupants per room:

# occupants per room
sj_occupants_per_room_by_block %>% 
  ggplot(aes(
  x = `percent less than 1`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with 1 or fewer occupant per room",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Room Occupancy"
  )

occupants_model <- lm(`% not completely at home` ~ `percent less than 1`, sj_occupants_per_room_by_block)
summary(occupants_model)
## 
## Call:
## lm(formula = `% not completely at home` ~ `percent less than 1`, 
##     data = sj_occupants_per_room_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.657  -4.937  -0.288   3.901  35.043 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           70.49553    2.95264  23.875  < 2e-16 ***
## `percent less than 1` -0.21239    0.03252  -6.532 1.45e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.062 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.07009,    Adjusted R-squared:  0.06845 
## F-statistic: 42.66 on 1 and 566 DF,  p-value: 1.451e-10
# compare to pre shelter in place
sj_occupants_per_room_by_block %>% 
  ggplot(aes(
  x = `percent less than 1`,
  y = `% not completely at home pre shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with 1 or fewer occupant per room",
    y = "Percent devices leaving home on weekdays pre shelter-in-place",
    title = "San Jose: Staying at Home and Room Occupancy Pre Shelter-in-Place"
  )

occupants_model2 <- lm(`% not completely at home pre shelter` ~ `percent less than 1`, sj_occupants_per_room_by_block)
summary(occupants_model2)
## 
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `percent less than 1`, 
##     data = sj_occupants_per_room_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.3246  -2.6506  -0.2808   2.7536  17.0509 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           62.88485    1.57437  39.943   <2e-16 ***
## `percent less than 1`  0.16329    0.01734   9.418   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.299 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1355, Adjusted R-squared:  0.134 
## F-statistic:  88.7 on 1 and 566 DF,  p-value: < 2.2e-16

Vehicle ownership:

# vehicles
sj_vehicles_by_block %>% 
  ggplot(aes(
  x = `vehicles per capita`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Number of vehicles per capita",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Vehicles Per Capita"
  )

# vehicles - percent with no vehicles
sj_no_vehicles_by_block %>% 
  ggplot(aes(
  x = `percent with vehicles`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with vehicles available",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Vehicle Availability"
  )

vehicles_model <- lm(`% not completely at home` ~ `percent with vehicles`, sj_no_vehicles_by_block)
summary(vehicles_model)
## 
## Call:
## lm(formula = `% not completely at home` ~ `percent with vehicles`, 
##     data = sj_no_vehicles_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.500  -5.226  -0.406   4.579  38.500 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             75.71497    5.14420   14.72  < 2e-16 ***
## `percent with vehicles` -0.25615    0.05393   -4.75 2.59e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.199 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.03833,    Adjusted R-squared:  0.03663 
## F-statistic: 22.56 on 1 and 566 DF,  p-value: 2.587e-06
# compare to pre shelter in place
sj_no_vehicles_by_block %>% 
  ggplot(aes(
  x = `percent with vehicles`,
  y = `% not completely at home pre shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with vehicles available",
    y = "Percent devices leaving home on weekdays pre shelter-in-place",
    title = "San Jose: Social Distancing and Vehicle Availability Pre Shelter-in-Place"
  )

vehicles_model2 <- lm(`% not completely at home pre shelter` ~ `percent with vehicles`, sj_no_vehicles_by_block)
summary(vehicles_model2)
## 
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `percent with vehicles`, 
##     data = sj_no_vehicles_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.5618  -2.9606  -0.0694   3.0006  12.6053 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             63.25942    2.83717  22.297  < 2e-16 ***
## `percent with vehicles`  0.15084    0.02975   5.071 5.37e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.522 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.04346,    Adjusted R-squared:  0.04177 
## F-statistic: 25.72 on 1 and 566 DF,  p-value: 5.37e-07

Multiple regression analysis with income, age, language, and occupants per room

# multiple regression 
modeltest <- lm(sj_ami_by_block$`% not completely at home` ~ sj_ami_by_block$`% over 125,000` + sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english > well` + sj_occupants_per_room_by_block$`percent less than 1`)
summary(modeltest)
## 
## Call:
## lm(formula = sj_ami_by_block$`% not completely at home` ~ sj_ami_by_block$`% over 125,000` + 
##     sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english > well` + 
##     sj_occupants_per_room_by_block$`percent less than 1`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.490  -4.658  -0.797   3.973  32.270 
## 
## Coefficients:
##                                                      Estimate Std. Error
## (Intercept)                                          50.72548    4.77236
## sj_ami_by_block$`% over 125,000`                     -0.23965    0.02152
## sj_age_by_block$`percent less than 30`                0.01581    0.04209
## sj_lang_by_block$`% speaking english > well`          0.13432    0.04611
## sj_occupants_per_room_by_block$`percent less than 1` -0.02274    0.04614
##                                                      t value Pr(>|t|)    
## (Intercept)                                           10.629  < 2e-16 ***
## sj_ami_by_block$`% over 125,000`                     -11.135  < 2e-16 ***
## sj_age_by_block$`percent less than 30`                 0.376  0.70739    
## sj_lang_by_block$`% speaking english > well`           2.913  0.00372 ** 
## sj_occupants_per_room_by_block$`percent less than 1`  -0.493  0.62234    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.297 on 563 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2423, Adjusted R-squared:  0.2369 
## F-statistic:    45 on 4 and 563 DF,  p-value: < 2.2e-16

Correlations with increase in staying home

Now I consider looking at correlations with the reduction in percent of devices leaving the home since shelter-in-place started relative to the pre-shelter-in-place levels.

# collect the demographic variables
sj_dem_distancing <- sj_internet_by_block %>% 
  dplyr::select(`percent high speed`, `% not completely at home`, `% Completely at Home`, blockgroup) %>% 
  left_join(sj_education_by_block %>% dplyr::select(blockgroup, `percent associates or higher`)) %>% 
  left_join(sj_ami_by_block %>% dplyr::select(blockgroup, `% over 125,000`)) %>% 
  left_join(sj_ami_by_block %>% dplyr::select(blockgroup, `% over 100,000`)) %>% 
  left_join(sj_ami_by_block %>% dplyr::select(blockgroup, `% over 75,000`)) %>% 
  left_join(sj_age_by_block %>% dplyr::select(blockgroup, `percent less than 30`)) %>% 
  left_join(sj_age_by_block %>% dplyr::select(blockgroup, `percent elderly`)) %>% 
  left_join(sj_lang_by_block %>% dplyr::select(blockgroup, `% not speaking spanish`)) %>% 
  left_join(sj_lang_by_block %>% dplyr::select(blockgroup, `% speaking english > well`)) %>% 
  left_join(sj_no_vehicles_by_block %>% dplyr::select(blockgroup, `percent with vehicles`)) %>%
  left_join(sj_occupants_per_room_by_block %>% dplyr::select(blockgroup, `percent less than 1`))

sj_dem_distancing_pre_post <- sj_dem_distancing %>% 
  left_join(sj_internet_by_block %>% dplyr::select(`% not completely at home pre shelter`, `% Completely at Home Pre Shelter`, blockgroup)) %>% 
  mutate(`% increase in staying completely home` = `% Completely at Home` - `% Completely at Home Pre Shelter`)

saveRDS(sj_dem_distancing_pre_post, "/Users/simonespeizer/Documents/2020 Spring Quarter/CEE 218Z/covid19/Simone_Speizer/sj_socialdistancing_demdata_prepostdifs_manyvars.rds")

# sj_dem_distancing_pre_post <- readRDS("/Users/simonespeizer/Documents/2020 Spring Quarter/CEE 218Z/covid19/Simone_Speizer/sj_socialdistancing_demdata_prepostdifs_manyvars.rds")

Age:

# age
sj_dem_distancing_pre_post %>%
  ggplot(aes(
  x = `percent less than 30`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
labs(
    x = "Percent of residents younger than 30",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and Young Age Groups"
  )

young_model_dif <- lm(sj_dem_distancing_pre_post$`% increase in staying completely home` ~ sj_dem_distancing_pre_post$`percent less than 30`)
summary(young_model_dif)
## 
## Call:
## lm(formula = sj_dem_distancing_pre_post$`% increase in staying completely home` ~ 
##     sj_dem_distancing_pre_post$`percent less than 30`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.099  -5.332  -0.019   5.441  30.340 
## 
## Coefficients:
##                                                   Estimate Std. Error t value
## (Intercept)                                       37.37809    1.73272  21.572
## sj_dem_distancing_pre_post$`percent less than 30` -0.29045    0.04392  -6.613
##                                                   Pr(>|t|)    
## (Intercept)                                        < 2e-16 ***
## sj_dem_distancing_pre_post$`percent less than 30` 8.72e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.501 on 567 degrees of freedom
## Multiple R-squared:  0.0716, Adjusted R-squared:  0.06997 
## F-statistic: 43.73 on 1 and 567 DF,  p-value: 8.723e-11
sj_dem_distancing_pre_post %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
  ggplot(aes(
  x = `percent elderly`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of residents 65 and older",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and Elderly Population"
  )

elderly_model_dif <- lm(`% increase in staying completely home` ~ `percent elderly`, sj_dem_distancing_pre_post %>% filter(`percent elderly` < 50))
summary(elderly_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent elderly`, 
##     data = sj_dem_distancing_pre_post %>% filter(`percent elderly` < 
##         50))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.371  -5.656  -0.321   6.062  31.123 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       22.25429    0.90241  24.661  < 2e-16 ***
## `percent elderly`  0.31026    0.06231   4.979 8.49e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.636 on 564 degrees of freedom
## Multiple R-squared:  0.04211,    Adjusted R-squared:  0.04041 
## F-statistic: 24.79 on 1 and 564 DF,  p-value: 8.494e-07

Income:

# income - less than $75000
sj_dem_distancing_pre_post %>% 
  ggplot(aes(
  x = `% over 75,000`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and Households Above 50% AMI"
  )

income_75_model_dif <- lm(`% increase in staying completely home` ~ `% over 75,000`, sj_dem_distancing_pre_post)
summary(income_75_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `% over 75,000`, 
##     data = sj_dem_distancing_pre_post)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.473  -4.471   0.535   4.900  24.178 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.25518    1.23696   6.674 5.95e-11 ***
## `% over 75,000`  0.28938    0.01907  15.177  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.257 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2893, Adjusted R-squared:  0.288 
## F-statistic: 230.4 on 1 and 566 DF,  p-value: < 2.2e-16
# income - less than $100000
sj_dem_distancing_pre_post %>% 
  ggplot(aes(
  x = `% over 100,000`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes over $100,000 (80% AMI) annually",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and Households Below 80% AMI"
  )

income_100_model_dif <- lm(`% increase in staying completely home` ~ `% over 100,000`, sj_dem_distancing_pre_post)
summary(income_100_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `% over 100,000`, 
##     data = sj_dem_distancing_pre_post)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.297  -4.436   0.906   5.318  20.897 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      11.47709    0.95110   12.07   <2e-16 ***
## `% over 100,000`  0.29007    0.01743   16.64   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.026 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3285, Adjusted R-squared:  0.3273 
## F-statistic: 276.9 on 1 and 566 DF,  p-value: < 2.2e-16
# income - less than $125000
sj_dem_distancing_pre_post %>% 
  ggplot(aes(
  x = `% over 125,000`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes over $125,000 annually",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and Households Below $125,000"
  )

income_125_model_dif <- lm(`% increase in staying completely home` ~ `% over 125,000`, sj_dem_distancing_pre_post)
summary(income_125_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `% over 125,000`, 
##     data = sj_dem_distancing_pre_post)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.150  -4.000   0.622   5.134  21.377 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      13.63687    0.78759   17.32   <2e-16 ***
## `% over 125,000`  0.30726    0.01738   17.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.862 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3556, Adjusted R-squared:  0.3545 
## F-statistic: 312.4 on 1 and 566 DF,  p-value: < 2.2e-16

Language:

# language
sj_dem_distancing_pre_post %>% 
  ggplot(aes(
  x = `% speaking english > well`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking English well",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and English Language Ability"
  )

english_ability_model_dif <- lm(`% increase in staying completely home` ~ `% speaking english > well`, sj_dem_distancing_pre_post)
summary(english_ability_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `% speaking english > well`, 
##     data = sj_dem_distancing_pre_post)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.714  -4.743   0.360   5.067  29.631 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -6.44360    3.75014  -1.718   0.0863 .  
## `% speaking english > well`  0.36828    0.04205   8.759   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.254 on 567 degrees of freedom
## Multiple R-squared:  0.1192, Adjusted R-squared:  0.1176 
## F-statistic: 76.72 on 1 and 567 DF,  p-value: < 2.2e-16
sj_dem_distancing_pre_post %>% 
  ggplot(aes(
  x = `% not speaking spanish`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals not speaking Spanish",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and Spanish Language Ability"
  )

spanish_speaking_model_dif <- lm(`% increase in staying completely home` ~ `% not speaking spanish`, sj_dem_distancing_pre_post)
summary(spanish_speaking_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `% not speaking spanish`, 
##     data = sj_dem_distancing_pre_post)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.698  -3.803   0.725   5.249  25.544 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               7.60830    1.45033   5.246  2.2e-07 ***
## `% not speaking spanish`  0.23948    0.01807  13.255  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.615 on 567 degrees of freedom
## Multiple R-squared:  0.2366, Adjusted R-squared:  0.2352 
## F-statistic: 175.7 on 1 and 567 DF,  p-value: < 2.2e-16

Occupants per room:

# occupants per room
sj_dem_distancing_pre_post %>% 
  ggplot(aes(
  x = `percent less than 1`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with 1 or fewer occupant per room",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and Room Occupancy"
  )

occupants_model_dif <- lm(`% increase in staying completely home` ~ `percent less than 1`, sj_dem_distancing_pre_post)
summary(occupants_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent less than 1`, 
##     data = sj_dem_distancing_pre_post)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.905  -4.893   0.459   5.144  27.142 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -7.61068    3.28780  -2.315    0.021 *  
## `percent less than 1`  0.37568    0.03621  10.376   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.978 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1598, Adjusted R-squared:  0.1583 
## F-statistic: 107.7 on 1 and 566 DF,  p-value: < 2.2e-16

Vehicle ownership:

# vehicles - percent with no vehicles
sj_dem_distancing_pre_post %>% 
  ggplot(aes(
  x = `percent with vehicles`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with vehicles available",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and Vehicle Availability"
  )

vehicles_model_dif <- lm(`% increase in staying completely home` ~ `percent with vehicles`, sj_dem_distancing_pre_post)
summary(vehicles_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent with vehicles`, 
##     data = sj_dem_distancing_pre_post)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.983  -5.915   0.056   5.696  29.934 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -12.45555    5.92452  -2.102    0.036 *  
## `percent with vehicles`   0.40699    0.06211   6.552 1.27e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.443 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.07051,    Adjusted R-squared:  0.06886 
## F-statistic: 42.93 on 1 and 566 DF,  p-value: 1.274e-10

Education:

sj_dem_distancing_pre_post %>% 
  ggplot(aes(
  x = `percent associates or higher`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of people with an degree at Associate's level or higher",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and Education"
  )

educ_model_dif <- lm(`% increase in staying completely home` ~ `percent associates or higher`, sj_dem_distancing_pre_post)
summary(educ_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent associates or higher`, 
##     data = sj_dem_distancing_pre_post)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.284  -3.059   1.107   5.124  22.190 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    13.14752    0.91413   14.38   <2e-16 ***
## `percent associates or higher`  0.27737    0.01794   15.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.27 on 567 degrees of freedom
## Multiple R-squared:  0.2966, Adjusted R-squared:  0.2954 
## F-statistic: 239.1 on 1 and 567 DF,  p-value: < 2.2e-16

Internet:

sj_dem_distancing_pre_post %>% 
  ggplot(aes(
  x = `percent high speed`,
  y = `% increase in staying completely home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with broadband such as cable, fiber optic or DSL",
    y = "Percent increase in staying completely at home since shelter-in-place",
    title = "San Jose: Social Distancing and High Speed Internet"
  )

internet_model_dif <- lm(`% increase in staying completely home` ~ `percent high speed`, sj_dem_distancing_pre_post)
summary(internet_model_dif)
## 
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent high speed`, 
##     data = sj_dem_distancing_pre_post)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.453  -4.843   0.409   5.396  26.676 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.61938    2.51490  -1.439    0.151    
## `percent high speed`  0.37057    0.03084  12.016   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.742 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2033, Adjusted R-squared:  0.2019 
## F-statistic: 144.4 on 1 and 566 DF,  p-value: < 2.2e-16

Multiple regression analysis with income, internet, Spanish language ability, and occupants per room.

difs_model <- lm(sj_dem_distancing_pre_post$`% increase in staying completely home` ~ sj_dem_distancing_pre_post$`% over 125,000` + sj_dem_distancing_pre_post$`% not speaking spanish` + sj_dem_distancing_pre_post$`percent less than 1` + sj_dem_distancing_pre_post$`percent high speed`)
summary(difs_model)
## 
## Call:
## lm(formula = sj_dem_distancing_pre_post$`% increase in staying completely home` ~ 
##     sj_dem_distancing_pre_post$`% over 125,000` + sj_dem_distancing_pre_post$`% not speaking spanish` + 
##         sj_dem_distancing_pre_post$`percent less than 1` + sj_dem_distancing_pre_post$`percent high speed`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.260  -3.786   0.914   4.997  21.342 
## 
## Coefficients:
##                                                     Estimate Std. Error t value
## (Intercept)                                          9.35496    3.74626   2.497
## sj_dem_distancing_pre_post$`% over 125,000`          0.24015    0.02483   9.671
## sj_dem_distancing_pre_post$`% not speaking spanish`  0.09663    0.02686   3.598
## sj_dem_distancing_pre_post$`percent less than 1`    -0.04485    0.04817  -0.931
## sj_dem_distancing_pre_post$`percent high speed`      0.04428    0.03808   1.163
##                                                     Pr(>|t|)    
## (Intercept)                                         0.012804 *  
## sj_dem_distancing_pre_post$`% over 125,000`          < 2e-16 ***
## sj_dem_distancing_pre_post$`% not speaking spanish` 0.000349 ***
## sj_dem_distancing_pre_post$`percent less than 1`    0.352196    
## sj_dem_distancing_pre_post$`percent high speed`     0.245436    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.739 on 563 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3789, Adjusted R-squared:  0.3745 
## F-statistic: 85.88 on 4 and 563 DF,  p-value: < 2.2e-16

Testing animating the plot

# another collection for pre shelter in place behavior
sj_dem_distancing_pre_shelter <- sj_dem_distancing %>% 
  dplyr::select(-`% not completely at home`) %>%
  left_join(sj_internet_by_block %>% dplyr::select(`% not completely at home pre shelter`, blockgroup))

# relabel column for leaving home
colnames(sj_dem_distancing_pre_shelter)[14] <- "% not completely at home"

sj_dem_distancing[is.na(sj_dem_distancing)] <- 0
sj_dem_distancing_pre_shelter[is.na(sj_dem_distancing_pre_shelter)] <- 0

# add column indicating before or after shelter in place, then bind the two sets of data
sj_dem_distancing_pre_shelter <- sj_dem_distancing_pre_shelter %>% 
  mutate(
    income_trendline =
      fitted(lm(sj_dem_distancing_pre_shelter$`% not completely at home` ~ sj_dem_distancing_pre_shelter$`% over 125,000`))
  ) %>% 
  cbind(`Before or After Shelter-in-Place` = "before")
sj_dem_distancing <-
  sj_dem_distancing %>%
  mutate(
    income_trendline =
      fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`% over 125,000`))
  ) %>% 
  cbind(`Before or After Shelter-in-Place` = "after") %>% 
  rbind(sj_dem_distancing_pre_shelter)

# try animating
fig <- 
  plot_ly (sj_dem_distancing) %>%
    add_trace(
      x = ~`% over 125,000`, 
      y = ~`% not completely at home`, 
      frame = ~`Before or After Shelter-in-Place`, 
      type = 'scatter', 
      mode = 'markers'
    ) %>% 
    add_trace(
      x = ~`% over 125,000`,
      y = ~income_trendline,
      type = 'scatter',
      mode = 'lines',
      line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
      frame = ~`Before or After Shelter-in-Place`
    ) %>% 
  animation_button(visible = F)
fig
# # save as rds
# saveRDS(sj_dem_distancing, "/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/sj_sd_dem_data.rds")


# fig <- plot_ly(sj_dem_distancing) %>% 
#   add_trace(
#     x = ~`% over 125,000`,
#     y = ~`% not completely at home`,
#     frame = ~`Before or After Shelter-in-Place`,
#     type = "scatter",
#     mode = "markers",
#     name = "Under $125,000",
#     marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
#     visible = T
#   ) %>% 
#   add_trace(
#     x = ~`% over 125,000`,
#     y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`% over 125,000`)),
#     name = 'trendline',
#     mode = 'lines',
#     line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
#     frame = ~`Before or After Shelter-in-Place`,
#     visible = F
#   ) %>%
#   add_trace(
#     x = ~`% not speaking spanish`,
#     y = ~`% not completely at home`,
#     frame = ~`Before or After Shelter-in-Place`,
#     name = "speak spanish",
#     marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
#     visible = F
#   ) %>% 
#   add_trace(
#     x = ~`% not speaking spanish`,
#     y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`% not speaking spanish`)),
#     name = 'trendline',
#     mode = 'lines',
#     line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
#     frame = ~`Before or After Shelter-in-Place`,
#     visible = F
#   ) %>% 
#   add_trace(
#     x = ~`percent associates or higher`,
#     y = ~`% not completely at home`,
#     frame = ~`Before or After Shelter-in-Place`,
#     name = "percent higher degree",
#     marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
#     visible = F
#   ) %>% 
#   add_trace(
#     x = ~`percent associates or higher`,
#     y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`percent associates or higher`)),
#     name = 'trendline',
#     mode = 'lines',
#     line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
#     frame = ~`Before or After Shelter-in-Place`,
#     visible = F
#   ) %>%
#   add_trace(
#     x = ~`percent high speed`,
#     y = ~`% not completely at home`,
#     frame = ~`Before or After Shelter-in-Place`,
#     name = "percent high speed internet access",
#     marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
#     visible = F
#   ) %>% 
#   add_trace(
#     x = ~`percent high speed`,
#     y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`percent high speed`)),
#     name = 'trendline',
#     mode = 'lines',
#     line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
#     frame = ~sj_dem_distancing$`Before or After Shelter-in-Place`,
#     visible = F
#   ) %>%
#   add_trace(
#     x = ~`percent less than 30`,
#     y = ~`% not completely at home`,
#     frame = ~`Before or After Shelter-in-Place`,
#     name = "percent less than 30",
#     marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
#     visible = F
#   ) %>% 
#   add_trace(
#     x = ~`percent less than 30`,
#     y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`percent less than 30`)),
#     name = 'trendline',
#     mode = 'lines',
#     line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
#     frame = ~`Before or After Shelter-in-Place`,
#     visible = F
#   ) %>%
#   layout(
#     updatemenus = list(
#       list(
#         active = 0,
#         type = 'buttons',
#         buttons = list(
#           list(
#             label = "Households Under $125,000",
#             method = 'update',
#             args = list(list(visible = c(T, T, F, F, F, F, F, F, F, F)),
#                         list(title = "Under $125,000",
#                              xaxis = list(title = "% Households Under $125,000 in Income")))),
#           list(
#             label = "Speaking Spanish",
#             method = 'update',
#             args = list(list(visible = c(F, F, T, T, F, F, F, F, F, F)),
#                         list(title = "Not Speaking Spanish",
#                              xaxis = list(title = "% Residents Not Speaking Spanish")))),
#           list(
#             label = "Education Level",
#             method = 'update',
#             args= list(list(visible = c(F, F, F, F, T, T, F, F, F, F)),
#                        list(xaxis = list(title = "% Residents With Associate's Degree or Higher")))),
#           list(
#             label = "High Speed Internet",
#             method = 'update',
#             args= list(list(visible = c(F, F, F, F, F, F, T, T, F, F)),
#                        list(xaxis = list(title = "% Households With High Speed Internet Access")))),
#           list(
#             label = "Young Population",
#             method = 'update',
#             args= list(list(visible = c(F, F, F, F, F, F, T, T, F, F)),
#                        list(xaxis = list(title = "% Residents Under Age 30"))))
#           )
#           )
#         ),
#     yaxis = list(title = "% Residents Leaving Home", 
#                  font = list(size = 15)),
#     showlegend = FALSE
#       )
# fig

Experimentation

Experimentation with other variables and other ways of analyzing the social distancing data. First I look at a few other possible variables. I start with units in the structure.

# try getting other variables
# get data on units in structure
sj_units_in_structure_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B25024)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>% 
  separate(label, into = c(NA, NA, "units"), sep = "!!") %>% 
  filter(!is.na(units)) %>%
  spread(key = units, value = estimate) %>%
  mutate(total_nums = `1, attached` + `1, detached` + `10 to 19` + `2` + `20 to 49`+ `3 or 4` + `5 to 9`+ `50 or more`+ `Boat, RV, van, etc.`+ `Mobile home`, `percent 20 or more` = (`20 to 49`+`50 or more`)* 100/ total_nums, `percent 1 only` = (`1, attached` + `1, detached`)*100/total_nums, `percent > 1` = 100 - `percent 1 only`) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count))

# plot 
sj_units_in_structure_by_block %>% 
  ggplot(aes(
  x = `percent 20 or more`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of structures with more than 20 units",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and 20 or More Units Per Structure"
  )

summary(lm(`% not completely at home` ~ `percent 20 or more`, sj_units_in_structure_by_block))
## 
## Call:
## lm(formula = `% not completely at home` ~ `percent 20 or more`, 
##     data = sj_units_in_structure_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.551  -5.052  -0.451   4.274  37.149 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          51.15091    0.40431 126.513   <2e-16 ***
## `percent 20 or more`  0.01873    0.02026   0.924    0.356    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.354 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.001507,   Adjusted R-squared:  -0.0002566 
## F-statistic: 0.8545 on 1 and 566 DF,  p-value: 0.3557
sj_units_in_structure_by_block %>% 
  ggplot(aes(
  x = `percent 1 only`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of structures with only one unit",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Only 1 Unit Per Structure"
  )

summary(lm(`% not completely at home` ~ `percent 1 only`, sj_units_in_structure_by_block))
## 
## Call:
## lm(formula = `% not completely at home` ~ `percent 1 only`, data = sj_units_in_structure_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.452  -5.117  -0.322   4.364  38.053 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      55.38663    0.88348  62.692  < 2e-16 ***
## `percent 1 only` -0.05596    0.01125  -4.975 8.68e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.184 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.04189,    Adjusted R-squared:  0.0402 
## F-statistic: 24.75 on 1 and 566 DF,  p-value: 8.681e-07

Household type and size:

# load data on household type and size
sj_house_size_type_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B11016)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>% 
  separate(label, into = c(NA, NA, "type", "size"), sep = "!!") %>% 
  filter(!is.na(type))


# household type
sj_house_type_by_block <- sj_house_size_type_by_block %>% 
  filter(is.na(size)) %>% 
  dplyr::select(-size) %>%
  spread(key = type, value = estimate) %>% 
  mutate(`total households` = `Family households` + `Nonfamily households`, `percent nonfamily` = `Nonfamily households` / `total households`) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count))

sj_house_type_by_block %>% 
  ggplot(aes(
  x = `percent nonfamily`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent nonfamily households",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Household Type"
  )

summary(lm(`% not completely at home` ~ `percent nonfamily`, sj_house_type_by_block))
## 
## Call:
## lm(formula = `% not completely at home` ~ `percent nonfamily`, 
##     data = sj_house_type_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.616  -5.108  -0.157   4.261  39.714 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          48.8864     0.6327  77.262  < 2e-16 ***
## `percent nonfamily`  10.1400     2.1962   4.617 4.82e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.208 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0363, Adjusted R-squared:  0.03459 
## F-statistic: 21.32 on 1 and 566 DF,  p-value: 4.82e-06
# household size
sj_house_size_by_block <- sj_house_size_type_by_block %>% 
  filter(!is.na(size)) %>% 
  dplyr::select(-type) %>%
  group_by(blockgroup, size) %>%
  summarize(`total of this size` = sum(estimate)) %>% 
  spread(key = size, value = `total of this size`) %>%
  mutate(total_nums = `1-person household` + `2-person household` + `3-person household` + `4-person household` + `5-person household`+ `6-person household` + `7-or-more person household`, `percent 5 or more` = (`5-person household`+`6-person household` + `7-or-more person household`)* 100/ total_nums, `percent 1 or 2 only` = (`1-person household` + `2-person household`)*100/total_nums) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count))

sj_house_size_by_block %>% 
  ggplot(aes(
  x = `percent 5 or more`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with 5 or more people",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Households With 5 or More"
  )

summary(lm(`% not completely at home` ~ `percent 5 or more`, sj_house_size_by_block))
## 
## Call:
## lm(formula = `% not completely at home` ~ `percent 5 or more`, 
##     data = sj_house_size_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.136  -4.979  -0.518   4.175  36.154 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         49.85951    0.56004  89.028  < 2e-16 ***
## `percent 5 or more`  0.08454    0.02513   3.364 0.000821 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.278 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0196, Adjusted R-squared:  0.01786 
## F-statistic: 11.31 on 1 and 566 DF,  p-value: 0.0008215
sj_house_size_by_block %>% 
  ggplot(aes(
  x = `percent 1 or 2 only`,
  y = `% not completely at home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with 1 or 2 people",
    y = "Percent devices leaving home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Small Household Size"
  )

summary(lm(`% not completely at home` ~ `percent 1 or 2 only`, sj_house_size_by_block))
## 
## Call:
## lm(formula = `% not completely at home` ~ `percent 1 or 2 only`, 
##     data = sj_house_size_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.847  -5.127  -0.450   4.596  37.830 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           50.13694    0.96771   51.81   <2e-16 ***
## `percent 1 or 2 only`  0.02678    0.02013    1.33    0.184    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.348 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.003118,   Adjusted R-squared:  0.001356 
## F-statistic:  1.77 on 1 and 566 DF,  p-value: 0.1839

Next I consider different ways of looking at the social distancing data. First I try distance traveled.

# try other ways of looking at the social distancing data
# first look at total distance traveled
sj_sd_distance <- sj_socialdistancing %>% 
  filter(date > shelter_start) %>% 
  group_by(origin_census_block_group) %>% 
  summarize(total_dist_traveled = sum(distance_traveled_from_home), device_count = sum(device_count)) %>%
  mutate(total_dist_per_device = total_dist_traveled / device_count)

sj_distance_testing <- left_join(sj_ami_by_block, sj_sd_distance, by = c("blockgroup" = "origin_census_block_group")) %>% left_join(sj_age_by_block %>% select(blockgroup, `percent less than 30`))

sj_distance_testing %>% filter(total_dist_per_device < 500)  %>% 
  ggplot(aes(
  x = `% over 75,000`,
  y = total_dist_per_device
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
    y = "Average distance traveled per device during weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Income, Distance Metric"
  )

This is very skewed by outliers, and probably not a useful metric.

Now I consider including devices that traveled <1km as staying at (or near) home.

sj_sd_range <- sj_socialdistancing %>% 
  filter(weekend == F) %>% 
  filter(date > shelter_start) %>%
  mutate(travel_buckets_split = lapply(bucketed_distance_traveled, function(x) strsplit(x, "<1000")[[1]][2]), less_than_1km = lapply(travel_buckets_split, function(x) strsplit(x, ":")[[1]][2]), less_than_1km = lapply(less_than_1km, function(x) strsplit(x, ",")[[1]][1])) %>%
  mutate(less_than_1km = lapply(less_than_1km, function(x) str_remove(x, "[}]")))  %>% # clean a bit more
  mutate(less_than_1km = as.numeric(less_than_1km), less_than_1km = replace_na(less_than_1km, 0)) %>% 
  mutate(home_or_1km = completely_home_device_count + less_than_1km) %>% 
  group_by(origin_census_block_group) %>% 
  summarize(home_or_1km = sum(home_or_1km), device_count = sum(device_count)) %>% 
  mutate(`% Within 1km of Home` = (home_or_1km/device_count*100) %>% round(1), `% farther than 1km` = (100-`% Within 1km of Home`))

# join this with other data
sj_1km_testing <- left_join(sj_ami_by_block, sj_sd_range, by = c("blockgroup" = "origin_census_block_group")) %>% 
  left_join(sj_occupants_per_room_by_block %>% dplyr::select(`percent less than 1`, blockgroup)) %>%
  left_join(sj_age_by_block %>% dplyr::select(`percent less than 30`, blockgroup)) %>%
  left_join(sj_lang_by_block %>% dplyr::select(`% speaking english > well`, blockgroup)) 

# plot with income
sj_1km_testing %>%  
  ggplot(aes(
  x = `% over 75,000`,
  y = `% farther than 1km`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
    y = "% of devices going farther than 1km of home, weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Income, 1km Range"
  )

summary(lm(`% farther than 1km` ~ `% over 75,000`, sj_1km_testing))
## 
## Call:
## lm(formula = `% farther than 1km` ~ `% over 75,000`, data = sj_1km_testing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.389  -4.795  -0.606   4.185  34.925 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     64.35928    1.11518   57.71   <2e-16 ***
## `% over 75,000` -0.20909    0.01719  -12.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.444 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2072, Adjusted R-squared:  0.2058 
## F-statistic:   148 on 1 and 566 DF,  p-value: < 2.2e-16
# plot with age
sj_1km_testing %>%  
  ggplot(aes(
  x = `percent less than 30`,
  y = `% farther than 1km`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of people younger than 30",
    y = "Percent of devices farther than 1km of home during weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Age, 1km Range"
  )

summary(lm(`% farther than 1km` ~ `percent less than 30`, sj_1km_testing))
## 
## Call:
## lm(formula = `% farther than 1km` ~ `percent less than 30`, data = sj_1km_testing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.505  -4.978  -0.362   4.274  39.808 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            44.49223    1.51348  29.397  < 2e-16 ***
## `percent less than 30`  0.17973    0.03836   4.685 3.51e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.299 on 567 degrees of freedom
## Multiple R-squared:  0.03726,    Adjusted R-squared:  0.03557 
## F-statistic: 21.95 on 1 and 567 DF,  p-value: 3.513e-06
# run multiple regression model
modeltest2 <- lm(sj_1km_testing$`% farther than 1km` ~ sj_1km_testing$`% over 75,000` + sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english > well` + sj_1km_testing$`percent less than 1`)
summary(modeltest2)
## 
## Call:
## lm(formula = sj_1km_testing$`% farther than 1km` ~ sj_1km_testing$`% over 75,000` + 
##     sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english > well` + 
##     sj_1km_testing$`percent less than 1`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.646  -4.675  -0.762   4.296  34.809 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                58.57603    4.69256  12.483   <2e-16
## sj_1km_testing$`% over 75,000`             -0.21745    0.02166 -10.039   <2e-16
## sj_1km_testing$`percent less than 30`       0.03813    0.04267   0.894   0.3718
## sj_1km_testing$`% speaking english > well`  0.09468    0.04607   2.055   0.0403
## sj_1km_testing$`percent less than 1`       -0.03946    0.04679  -0.843   0.3994
##                                               
## (Intercept)                                ***
## sj_1km_testing$`% over 75,000`             ***
## sj_1km_testing$`percent less than 30`         
## sj_1km_testing$`% speaking english > well` *  
## sj_1km_testing$`percent less than 1`          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.424 on 563 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2158, Adjusted R-squared:  0.2102 
## F-statistic: 38.72 on 4 and 563 DF,  p-value: < 2.2e-16

It looks like the fit of these selected variables is slightly better for the social distancing data based on not traveling farther than 1km.

Now I also consider “non-work” behavior.

sj_nonworking_by_block <- sj_socialdistancing %>% 
  filter(weekend == F) %>% 
  filter(date > shelter_start) %>%
  mutate(nonworking = device_count - completely_home_device_count - part_time_work_behavior_devices - full_time_work_behavior_devices) %>%
  group_by(origin_census_block_group) %>%
  summarize(nonworking_count = sum(nonworking), total_device = sum(device_count)) %>% 
  mutate(nonworking_percent = nonworking_count*100 / total_device, percent_only_work_home = 100-nonworking_percent) %>%
  left_join(sj_1km_testing %>% dplyr::select(`% over 75,000`, `percent less than 30`, `% speaking english > well`, `percent less than 1`, blockgroup), by = c("origin_census_block_group" = "blockgroup"))


# plot against age and income
sj_nonworking_by_block %>%  
  ggplot(aes(
  x = `% over 75,000`,
  y = nonworking_percent
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
    y = "Percent of devices leaving home for non-work purposes during weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Income, Nonworking Behavior"
  )

summary(lm(nonworking_percent ~ `% over 75,000`, sj_nonworking_by_block))
## 
## Call:
## lm(formula = nonworking_percent ~ `% over 75,000`, data = sj_nonworking_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.922  -4.227  -0.758   3.670  33.628 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     52.99603    1.02994   51.46   <2e-16 ***
## `% over 75,000` -0.16521    0.01588  -10.41   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.875 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1606, Adjusted R-squared:  0.1591 
## F-statistic: 108.3 on 1 and 566 DF,  p-value: < 2.2e-16
sj_nonworking_by_block %>%  
  ggplot(aes(
  x = `percent less than 30`,
  y = nonworking_percent
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of people younger than 30",
    y = "Percent of devices leaving home for non-work purposes during weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Age, Nonworking Behavior"
  )

summary(lm(nonworking_percent ~ `percent less than 30`, sj_nonworking_by_block))
## 
## Call:
## lm(formula = nonworking_percent ~ `percent less than 30`, data = sj_nonworking_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.773  -4.144  -0.345   3.400  32.456 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            35.85635    1.33549  26.849  < 2e-16 ***
## `percent less than 30`  0.17865    0.03385   5.277 1.87e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.323 on 567 degrees of freedom
## Multiple R-squared:  0.04682,    Adjusted R-squared:  0.04513 
## F-statistic: 27.85 on 1 and 567 DF,  p-value: 1.871e-07
# multiple regression model
modeltest3 <- lm(sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% over 75,000` + sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english > well` + sj_nonworking_by_block$`percent less than 1`)
summary(modeltest3)
## 
## Call:
## lm(formula = sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% over 75,000` + 
##     sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english > well` + 
##     sj_nonworking_by_block$`percent less than 1`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.758  -4.001  -0.742   3.843  31.685 
## 
## Coefficients:
##                                                    Estimate Std. Error t value
## (Intercept)                                        44.10456    4.31944  10.211
## sj_nonworking_by_block$`% over 75,000`             -0.17101    0.01994  -8.577
## sj_nonworking_by_block$`percent less than 30`       0.08287    0.03928   2.110
## sj_nonworking_by_block$`% speaking english > well`  0.07965    0.04241   1.878
## sj_nonworking_by_block$`percent less than 1`       -0.01102    0.04307  -0.256
##                                                    Pr(>|t|)    
## (Intercept)                                          <2e-16 ***
## sj_nonworking_by_block$`% over 75,000`               <2e-16 ***
## sj_nonworking_by_block$`percent less than 30`        0.0353 *  
## sj_nonworking_by_block$`% speaking english > well`   0.0609 .  
## sj_nonworking_by_block$`percent less than 1`         0.7981    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.833 on 563 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1752, Adjusted R-squared:  0.1693 
## F-statistic: 29.89 on 4 and 563 DF,  p-value: < 2.2e-16

These variables do worse for the percent nonworking metric, which makes sense.